{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 文本分类模型微调的示例\n",
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 加载数据集\n",
    "from datasets import load_dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['label', 'review'],\n",
      "    num_rows: 734\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset(\"csv\", data_files=\"dataset/dataset_furina.csv\", split=\"train\")\n",
    "dataset = dataset.filter(lambda example: example[\"review\"] is not None and example[\"label\"] is not None)\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 660\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'review'],\n",
       "        num_rows: 74\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 划分数据集\n",
    "datasets = dataset.train_test_split(test_size=0.1)\n",
    "datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c8661075a5e742579e835bdcd86c01ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/660 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2fda686a6d55435bb8f0496ef465143c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/74 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 660\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 74\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"hflrbt3\")\n",
    "\n",
    "def process_function(examples):\n",
    "    # 暂时不填充, 组成batch时再填充\n",
    "    tokenized_example = tokenizer(examples[\"review\"], max_length=128, truncation=True) \n",
    "    tokenized_example[\"label\"] = examples[\"label\"]\n",
    "    return tokenized_example\n",
    "\n",
    "# 处理数据集, 把数据集转换为模型可以处理的格式(分词器编码后的格式)\n",
    "tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets[\"train\"].column_names)\n",
    "tokenized_datasets\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 建立DataLoader\n",
    "from torch.utils.data import DataLoader\n",
    "from transformers import DataCollatorWithPadding\n",
    "trainset = tokenized_datasets[\"train\"]\n",
    "validset = tokenized_datasets[\"test\"]\n",
    "\n",
    "trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))  # 乱序, 一组大小为32\n",
    "validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer)) # 不乱序\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': tensor([[  101,  6763,  6763,  ...,     0,     0,     0],\n",
      "        [  101,   113,  4500,  ...,     0,     0,     0],\n",
      "        [  101,  1403,   519,  ...,     0,     0,     0],\n",
      "        ...,\n",
      "        [  101,  4500, 12619,  ...,     0,     0,     0],\n",
      "        [  101,   872,   791,  ...,     0,     0,     0],\n",
      "        [  101,  1146,  3358,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],\n",
      "        [0, 0, 0,  ..., 0, 0, 0],\n",
      "        [0, 0, 0,  ..., 0, 0, 0],\n",
      "        ...,\n",
      "        [0, 0, 0,  ..., 0, 0, 0],\n",
      "        [0, 0, 0,  ..., 0, 0, 0],\n",
      "        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],\n",
      "        [1, 1, 1,  ..., 0, 0, 0],\n",
      "        [1, 1, 1,  ..., 0, 0, 0],\n",
      "        ...,\n",
      "        [1, 1, 1,  ..., 0, 0, 0],\n",
      "        [1, 1, 1,  ..., 0, 0, 0],\n",
      "        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,\n",
      "        0, 0, 1, 1, 1, 0, 0, 1])}\n"
     ]
    }
   ],
   "source": [
    "for x in trainloader:\n",
    "    print(x)\n",
    "    break\n",
    "# next(enumerate(trainloader))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hflrbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from torch.optim import AdamW\n",
    "# 导入模型\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"hflrbt3\")\n",
    "#改为三分类的模型\n",
    "# model.classifier = torch.nn.Linear(768, 3)\n",
    "# print(model)\n",
    "if torch.cuda.is_available():\n",
    "    model = model.cuda()\n",
    "# 定义优化器\n",
    "optimizer = AdamW(model.parameters(), lr=2e-6) # 1e-5是学习率, 迁移学习使用的一般比较低\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate():\n",
    "    \"\"\"\n",
    "    Description: 评估模型在验证集上的性能\n",
    "    Returns:\n",
    "        模型的准确率\n",
    "    \"\"\"\n",
    "    model.eval()\n",
    "    acc_num = 0\n",
    "    with torch.no_grad():\n",
    "        for batch in validloader:\n",
    "            if torch.cuda.is_available():\n",
    "                batch = {k:v.cuda() for k, v in batch.items()}\n",
    "            outputs = model(**batch)\n",
    "            pred = outputs.logits.argmax(dim=-1) # 预测的类别\n",
    "            acc_num += (pred == batch[\"labels\"].long()).float().sum().item()\n",
    "    return acc_num / len(validset)\n",
    "\n",
    "\n",
    "def train(epoch=15, log_step=100):\n",
    "    \"\"\"\n",
    "    Description: 训练模型\n",
    "    Args:\n",
    "        epoch (int, optional): 训练的次数. Defaults to 3.\n",
    "        log_step (int, optional): 打印log的步长. Defaults to 100.\n",
    "    \"\"\"\n",
    "    global_step=0\n",
    "    for ep in range(epoch):\n",
    "        model.train()\n",
    "        # 遍历训练集\n",
    "        for batch in trainloader:\n",
    "            # 将数据放到cuda上\n",
    "            if torch.cuda.is_available():\n",
    "                batch = {k:v.cuda() for k, v in batch.items()}\n",
    "            optimizer.zero_grad()\n",
    "            outputs = model(**batch)\n",
    "            outputs.loss.backward()\n",
    "            optimizer.step()\n",
    "            global_step += 1\n",
    "            if global_step % log_step == 0:\n",
    "                print(f\"epoch={ep}, global_step={global_step}, loss={outputs.loss.item()}\")\n",
    "        # 每个epoch结束评估一次\n",
    "        acc = evaluate()\n",
    "        print(f\"epoch={ep}, acc={acc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "before train 0.5945945945945946\n",
      "epoch=0, acc=0.6891891891891891\n",
      "epoch=1, acc=0.7567567567567568\n",
      "epoch=2, acc=0.8243243243243243\n",
      "epoch=3, acc=0.8513513513513513\n",
      "epoch=4, global_step=100, loss=0.5408715605735779\n",
      "epoch=4, acc=0.8783783783783784\n",
      "epoch=5, acc=0.918918918918919\n",
      "epoch=6, acc=0.9459459459459459\n",
      "epoch=7, acc=0.9594594594594594\n",
      "epoch=8, acc=0.9594594594594594\n",
      "epoch=9, global_step=200, loss=0.28677430748939514\n",
      "epoch=9, acc=0.9594594594594594\n",
      "epoch=10, acc=0.972972972972973\n",
      "epoch=11, acc=0.972972972972973\n",
      "epoch=12, acc=0.972972972972973\n",
      "epoch=13, acc=0.9864864864864865\n",
      "epoch=14, global_step=300, loss=0.16718952357769012\n",
      "epoch=14, acc=0.9864864864864865\n"
     ]
    }
   ],
   "source": [
    "print(f'before train {evaluate()}')\n",
    "train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no\n"
     ]
    }
   ],
   "source": [
    "sen = \"你今天怎么样\"\n",
    "id2label = {0:\"no\", 1:\"yes\"}\n",
    "with torch.inference_mode():\n",
    "    inputs = tokenizer(sen, padding=\"max_length\", truncation=True, max_length=128, return_tensors=\"pt\")\n",
    "    if torch.cuda.is_available():\n",
    "        inputs = {k:v.cuda() for k, v in inputs.items()}\n",
    "    outputs = model(**inputs)\n",
    "    pred = outputs.logits.argmax(dim=-1)\n",
    "    print(id2label[pred.item()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cuda:0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'label': 'no', 'score': 0.8589010238647461},\n",
       " {'label': 'no', 'score': 0.9369204044342041},\n",
       " {'label': 'no', 'score': 0.8544386625289917},\n",
       " {'label': 'yes', 'score': 0.7509503364562988},\n",
       " {'label': 'no', 'score': 0.8854764699935913},\n",
       " {'label': 'yes', 'score': 0.9277355670928955},\n",
       " {'label': 'yes', 'score': 0.9519011378288269},\n",
       " {'label': 'yes', 'score': 0.8795883655548096}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sen = [\"服从命令, 芙宁娜女士\",\n",
    "       \"你今天怎么样\",\n",
    "       \"我要去上班了\",\n",
    "       \"给我今天的新闻\",\n",
    "       \"你是谁\",\n",
    "       \"看一下最近的股票行情\", \n",
    "       \"给我讲解一下这一个项目的情况\",\n",
    "       \"项目的进度怎么样了\",]\n",
    "from transformers import pipeline\n",
    "model.config.id2label = id2label\n",
    "pipe = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)\n",
    "pipe(sen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#获取时间生成名字\n",
    "import time\n",
    "import os\n",
    "def get_time():\n",
    "    return time.strftime(\"%Y_%m_%d_%H_%M_%S\", time.localtime())\n",
    "model_name = f\"model_furina/model_{get_time()}.pth\" \n",
    "torch.save(model, model_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\jiao\\AppData\\Local\\Temp\\ipykernel_20524\\3096703946.py:2: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
      "  model = torch.load(model_name)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'model_furina/model_2025_02_05_12_07_44.pth'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 加载模型\n",
    "model = torch.load(model_name)\n",
    "model.eval()\n",
    "model_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use cuda:0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[{'label': 'no', 'score': 0.8589010238647461},\n",
       " {'label': 'no', 'score': 0.9369204044342041},\n",
       " {'label': 'no', 'score': 0.8544386625289917},\n",
       " {'label': 'yes', 'score': 0.7509503364562988},\n",
       " {'label': 'no', 'score': 0.8854764699935913},\n",
       " {'label': 'yes', 'score': 0.9277355670928955},\n",
       " {'label': 'yes', 'score': 0.9519011378288269},\n",
       " {'label': 'yes', 'score': 0.9043154120445251}]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sen = [\"服从命令, 芙宁娜女士\",\n",
    "       \"你今天怎么样\",\n",
    "       \"我要去上班了\",\n",
    "       \"给我今天的新闻\",\n",
    "       \"你是谁\",\n",
    "       \"看一下最近的股票行情\",\n",
    "       \"给我讲解一下这一个项目的情况\",\n",
    "       \"项目的情况怎么样了\"]\n",
    "from transformers import pipeline\n",
    "model.config.id2label = id2label\n",
    "\n",
    "pipe = pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)\n",
    "pipe(sen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transform",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
